# Point the session at the project folder so relative paths (e.g. the CSV
# read later in this script) resolve.
# NOTE(review): this path is machine-specific; an RStudio project or a
# project-relative path helper would make the script portable.
setwd("~/Desktop/working-with-lyle/Formality_Project")

# Install pacman on first use, then let it install/load every other package.
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
library(pacman)
pacman::p_load(
  tidyverse, rlang, zoo, lubridate, plotrix, ggpubr, caret, broom,
  kableExtra, reactable,
  install = TRUE
)

# Colour palettes. palette_map was previously swallowed by the comment on the
# preceding line and therefore never defined.
palette_map <- c("#3B9AB2", "#EBCC2A", "#F21A00")
palette_condition <- c("#ee9b00", "#bb3e03", "#005f73")
# Shared ggplot2 theme that is added to every figure in this script.
plot_aes <- theme_classic() +
  theme(
    legend.position = "top",
    legend.text = element_text(size = 12),
    text = element_text(size = 16, family = "Futura Medium"),
    axis.text = element_text(color = "black"),
    axis.line = element_line(colour = "black"),
    axis.ticks.y = element_blank()
  )

# Render a fitted model's coefficient table as a styled kable.
#
# model_data: a fitted model object that tidy() can summarise (this script
#   passes lm() fits).
# Returns: the kable_styling() output, with the std.error / statistic /
#   p.value columns renamed to SE / t / p.
table_model <- function(model_data) {
  model_data %>%
    tidy() %>%
    rename(
      "SE" = std.error,
      "t" = statistic,
      "p" = p.value
    ) %>%
    kable() %>%
    kableExtra::kable_styling()
}

# Read in the data.
df <- read_csv("books_cleaned_LIWC.csv")

# One row per publication year with the mean and standard error of each
# variable of interest. Output columns are named <variable>_mean and
# <variable>_std.error, which is what the plots and models below refer to.
# across() replaces the deprecated summarise_at(..., funs(...)) form.
tidy_df <- df %>%
  group_by(ORIG_PUBL_DATE) %>% # grouping by the year
  summarise(
    across(
      all_of(c("Analytic", "WPS", "BigWords", "Period")),
      list(mean = mean, std.error = std.error)
    )
  )
# Keep only the yearly summary row(s) for publication year 1933.
year_means <- filter(tidy_df, ORIG_PUBL_DATE == 1933)
# Create centered versions of the yearly means by subtracting fixed
# reference values.
# NOTE(review): the previous comment said "centered on 1857", but the only
# reference year used in this script is 1933 (see year_means and the figure
# caption). The constants below are presumably the rounded 1933 means --
# confirm against year_means.
tidy_df$Analytic_centered <- tidy_df$Analytic_mean - 53.1
tidy_df$WPS_centered <- tidy_df$WPS_mean - 12.52
tidy_df$BigWords_centered <- tidy_df$BigWords_mean - 16.06
tidy_df$Period_centered <- tidy_df$Period_mean - 8.13

# Total number of rows in the data set, shown as an interactive table.
df %>%
  select(Filename) %>%
  dplyr::summarize(n = n()) %>%
  reactable::reactable(striped = TRUE)

# Number of distinct (AUTH_GENDER, Filename) pairs per author gender.
auth_sex <- df %>%
  select(AUTH_GENDER, Filename) %>%
  unique() %>%
  group_by(AUTH_GENDER) %>%
  dplyr::summarize(n = n()) %>%
  reactable::reactable(striped = TRUE)
auth_sex

# Mean word count per author gender.
# NOTE(review): unique() de-duplicates on (AUTH_GENDER, WC) rather than on
# Filename, so two files with the same gender and word count collapse into
# one row -- confirm this is intended.
WC_sex <- df %>%
  select(AUTH_GENDER, WC) %>%
  unique() %>%
  group_by(AUTH_GENDER) %>%
  dplyr::summarize(mean = mean(WC)) %>%
  reactable::reactable(striped = TRUE)
WC_sex

#Plot our smoothed data
#we are using Non-tidy data here to capture the individual variation
#Analytic Thinking
# Scatter of every row in df with a LOESS trend across publication year.
# The annotation previously contained a dangling "intercept =" line with no
# value; it now shows only the slope estimate and p-value, like the other
# panels.
Analytic_smooth <- ggplot(data = df, aes(x = ORIG_PUBL_DATE, y = Analytic, group = 1)) +
  ggtitle("Analytic Thinking") +
  geom_point(color = "dodgerblue3", alpha = 0.2) +
  geom_smooth(method = "loess", span = 0.50) +
  plot_aes +
  labs(x = "Year", y = "% of Total Words") +
  # Single theme() call holding the net effect of the formerly stacked calls:
  # later settings for the same element overrode the earlier ones.
  theme(
    axis.text = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20),
    plot.title.position = "plot"
  ) +
  annotate(
    geom = "text", x = 1935, y = 70,
    label = "\nestimate = -0.0484\np-value = 0.0785\n",
    size = 3.5
  )
# Big words: scatter of every row in df with a LOESS trend across year.
Bw_smooth <- ggplot(df, aes(ORIG_PUBL_DATE, BigWords, group = 1)) +
  geom_point(color = "dodgerblue3", alpha = 0.2) +
  geom_smooth(method = "loess", span = 0.5) +
  annotate(
    "text",
    x = 1935, y = 20, size = 3.5,
    label = "\nestimate = -0.0095\np-value = 0.0236\n"
  ) +
  labs(title = "Big Words (Letters > 6)", x = "Year", y = "% of Total Words") +
  plot_aes +
  # Axis text 14 pt (x labels rotated 45 degrees); axis titles and the
  # centered plot title 20 pt bold.
  theme(
    axis.text = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20),
    plot.title.position = "plot"
  )
# Period usage: scatter of every row in df with a LOESS trend across year.
period_smooth <- ggplot(df, aes(ORIG_PUBL_DATE, Period, group = 1)) +
  geom_point(color = "dodgerblue3", alpha = 0.2) +
  geom_smooth(method = "loess", span = 0.5) +
  annotate(
    "text",
    x = 1935, y = 11, size = 3.5,
    label = "\nestimate = 0.0167\np-value < .001\n"
  ) +
  labs(title = "Period Usage", x = "Year", y = "% of Total Words") +
  plot_aes +
  # Axis text 14 pt (x labels rotated 45 degrees); axis titles and the
  # centered plot title 20 pt bold.
  theme(
    axis.text = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20),
    plot.title.position = "plot"
  )
# Words per sentence: scatter of every row in df with a LOESS trend across
# year (wider span than the other row-level panels).
wps_smooth <- ggplot(df, aes(ORIG_PUBL_DATE, WPS, group = 1)) +
  geom_point(color = "dodgerblue3", alpha = 0.2) +
  geom_smooth(method = "loess", span = 0.7) +
  annotate(
    "text",
    x = 1935, y = 25, size = 3.5,
    label = "\nestimate = -0.0348\np-value < .001\n"
  ) +
  labs(title = "Words per Sentence", x = "Year", y = "# of Words") +
  plot_aes +
  # Axis text 14 pt (x labels rotated 45 degrees); axis titles and the
  # centered plot title 20 pt bold.
  theme(
    axis.text = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20),
    plot.title.position = "plot"
  )
# Arrange the four row-level smooth plots in a 2 x 2 grid.
smooth_graphs <- ggpubr::ggarrange(
  Analytic_smooth, Bw_smooth, period_smooth, wps_smooth,
  ncol = 2, nrow = 2, common.legend = TRUE, legend = "bottom"
)

# Add the overall title and footnote. geom_smooth() draws a 95% confidence
# band by default (se = TRUE, level = 0.95), so the note describes the
# shading as a confidence interval rather than a standard error.
annotate_figure(
  smooth_graphs,
  top = text_grob("Smooth Formality Graphs", color = "black", face = "bold", size = 20),
  bottom = text_grob(
    "Note. Shaded bands represent 95% confidence intervals around the LOESS fit.",
    color = "Black",
    hjust = 1.0, x = 1, face = "italic", size = 14
  )
)

# Analytic Thinking, yearly means: one point per year with a LOESS trend.
Analytic_smooth_tidy <- ggplot(data = tidy_df, aes(x = ORIG_PUBL_DATE, y = Analytic_mean, group = 1)) +
  ggtitle("Analytic Thinking") +
  geom_point(color = "dodgerblue3", alpha = 0.5) +
  geom_smooth(method = "loess", span = 0.90) +
  plot_aes +
  labs(x = "Year", y = "% of Total Words") +
  # Single theme() call holding the net effect of the formerly stacked calls.
  theme(
    axis.text = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20),
    plot.title.position = "plot"
  ) +
  annotate(
    geom = "text", x = 1935, y = 60,
    label = "\nestimate = -0.0404\np-value = 0.1339\n",
    size = 3.5
  )
# Big words, yearly means: one point per year with a LOESS trend.
Bw_smooth_tidy <- ggplot(tidy_df, aes(ORIG_PUBL_DATE, BigWords_mean, group = 1)) +
  geom_point(color = "dodgerblue3", alpha = 0.5) +
  geom_smooth(method = "loess", span = 0.6) +
  annotate(
    "text",
    x = 1935, y = 17, size = 3.5,
    label = "\nestimate = -0.0075\np-value = 0.0999\n"
  ) +
  labs(title = "Big Words N > 6", x = "Year", y = "% of Total Words") +
  plot_aes +
  # Axis text 14 pt (x labels rotated 45 degrees); axis titles and the
  # centered plot title 20 pt bold.
  theme(
    axis.text = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20),
    plot.title.position = "plot"
  )
# Period usage, yearly means: one point per year with a LOESS trend.
period_smooth_tidy <- ggplot(tidy_df, aes(ORIG_PUBL_DATE, Period_mean, group = 1)) +
  geom_point(color = "dodgerblue3", alpha = 0.5) +
  geom_smooth(method = "loess", span = 0.6) +
  annotate(
    "text",
    x = 1935, y = 8, size = 3.5,
    label = "\nestimate = 0.0166\np-value < .001\n"
  ) +
  labs(title = "Period Usage", x = "Year", y = "% of Total Words") +
  plot_aes +
  # Axis text 14 pt (x labels rotated 45 degrees); axis titles and the
  # centered plot title 20 pt bold.
  theme(
    axis.text = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20),
    plot.title.position = "plot"
  )
# Words per sentence, yearly means: one point per year with a LOESS trend.
wps_smooth_tidy <- ggplot(tidy_df, aes(ORIG_PUBL_DATE, WPS_mean, group = 1)) +
  geom_point(color = "dodgerblue3", alpha = 0.5) +
  geom_smooth(method = "loess", span = 0.9) +
  annotate(
    "text",
    x = 1935, y = 16, size = 3.5,
    label = "\nestimate = -0.0347\np-value < .001\n"
  ) +
  labs(title = "Words per Sentence", x = "Year", y = "# of Words") +
  plot_aes +
  # Axis text 14 pt (x labels rotated 45 degrees); axis titles and the
  # centered plot title 20 pt bold.
  theme(
    axis.text = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20),
    plot.title.position = "plot"
  )
# Arrange the four yearly-mean smooth plots in a 2 x 2 grid.
tidy_smooth_graphs <- ggpubr::ggarrange(
  Analytic_smooth_tidy, Bw_smooth_tidy,
  period_smooth_tidy, wps_smooth_tidy,
  ncol = 2, nrow = 2, common.legend = TRUE, legend = "bottom"
)

# Add the overall title and footnote. geom_smooth() draws a 95% confidence
# band by default (se = TRUE, level = 0.95), so the note describes the
# shading as a confidence interval rather than a standard error.
annotate_figure(
  tidy_smooth_graphs,
  top = text_grob("Smooth Formality Graphs (grouped by year)", color = "black", face = "bold", size = 20),
  bottom = text_grob(
    "Note. Shaded bands represent 95% confidence intervals around the LOESS fit.
Estimates shown are from centered analyses (centered on 1933; first year in the dataset).",
    color = "Black",
    hjust = 1.05, x = 1, face = "italic", size = 16
  )
)

# Plotting the data by year (one data point per year).
# Analytic Thinking: yearly mean as a line, with a ribbon spanning
# Analytic_mean -/+ Analytic_std.error.
Analytic <- ggplot(tidy_df, aes(ORIG_PUBL_DATE, Analytic_mean, group = 1)) +
  geom_line(color = "dodgerblue3") +
  geom_ribbon(
    aes(
      ymin = Analytic_mean - Analytic_std.error,
      ymax = Analytic_mean + Analytic_std.error
    ),
    alpha = 0.2
  ) +
  labs(title = "Analytic Thinking", x = "Year", y = "Standardized score") +
  plot_aes +
  # Axis text 14 pt (x labels rotated 45 degrees); axis titles and the
  # centered plot title 20 pt bold.
  theme(
    axis.text = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20),
    plot.title.position = "plot"
  )
# Words per sentence: yearly mean as a line, with a ribbon spanning
# WPS_mean -/+ WPS_std.error.
WPS <- ggplot(tidy_df, aes(ORIG_PUBL_DATE, WPS_mean, group = 1)) +
  geom_line(color = "dodgerblue3") +
  geom_ribbon(
    aes(
      ymin = WPS_mean - WPS_std.error,
      ymax = WPS_mean + WPS_std.error
    ),
    alpha = 0.2
  ) +
  labs(title = "WPS", x = "Year", y = "# of Words") +
  plot_aes +
  # Axis text 14 pt (x labels rotated 45 degrees); axis titles and the
  # centered plot title 20 pt bold.
  theme(
    axis.text = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20),
    plot.title.position = "plot"
  )
# Big words: yearly mean as a line, with a ribbon spanning
# BigWords_mean -/+ BigWords_std.error.
BigWords <- ggplot(tidy_df, aes(ORIG_PUBL_DATE, BigWords_mean, group = 1)) +
  geom_line(color = "dodgerblue3") +
  geom_ribbon(
    aes(
      ymin = BigWords_mean - BigWords_std.error,
      ymax = BigWords_mean + BigWords_std.error
    ),
    alpha = 0.2
  ) +
  labs(title = "Big Words N > 6", x = "Year", y = "% of Total Words") +
  plot_aes +
  # Axis text 14 pt (x labels rotated 45 degrees); axis titles and the
  # centered plot title 20 pt bold.
  theme(
    axis.text = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20),
    plot.title.position = "plot"
  )
# Period frequency: yearly mean as a line, with a ribbon spanning
# Period_mean -/+ Period_std.error.
Period <- ggplot(tidy_df, aes(ORIG_PUBL_DATE, Period_mean, group = 1)) +
  geom_line(color = "dodgerblue3") +
  geom_ribbon(
    aes(
      ymin = Period_mean - Period_std.error,
      ymax = Period_mean + Period_std.error
    ),
    alpha = 0.2
  ) +
  labs(title = "Period-usage", x = "Year", y = "% of Total Words") +
  plot_aes +
  # Axis text 14 pt (x labels rotated 45 degrees); axis titles and the
  # centered plot title 20 pt bold.
  theme(
    axis.text = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20),
    plot.title.position = "plot"
  )
# Raw graphs: arrange the four yearly line plots in a 2 x 2 grid.
raw_graphs <- ggpubr::ggarrange(
  Analytic, BigWords, Period, WPS,
  ncol = 2, nrow = 2, common.legend = TRUE, legend = "bottom"
)

# Add the overall title and footnote.
annotate_figure(
  raw_graphs,
  top = text_grob("Raw Formality Graphs (grouped by year)", color = "black", face = "bold", size = 20),
  bottom = text_grob(
    "Note. Graphs are of books in the collection",
    color = "Black",
    hjust = 1.7, x = 1, face = "italic", size = 16
  )
)

# Models presented in order: Raw data, aggregated by year, centered on 1857
# NOTE(review): "1857" looks stale -- the only reference year used elsewhere
# in this script is 1933; confirm which year the centering constants use.
# Analytic Thinking regressed on publication year.
#Raw Data
AT_RAW <- lm(Analytic ~ ORIG_PUBL_DATE, data = df)
#Tidy Data
AT_TIDY <- lm(Analytic_mean ~ ORIG_PUBL_DATE, data = tidy_df)
#centered
AT_centered <- lm(Analytic_centered ~ ORIG_PUBL_DATE, data = tidy_df)

# The tables below are the output recorded in the notebook, kept as comments
# so that the script parses.
table_model(AT_RAW)
# | term | estimate | SE | t | p |
# |---|---|---|---|---|
# | (Intercept) | 147.5605 | 54.4342 | 2.711 | 0.0069 |
# | ORIG_PUBL_DATE | -0.0484 | 0.0275 | -1.763 | 0.0785 |

table_model(AT_TIDY)
# | term | estimate | SE | t | p |
# |---|---|---|---|---|
# | (Intercept) | 131.7613 | 52.7234 | 2.499 | 0.0144 |
# | ORIG_PUBL_DATE | -0.0404 | 0.0267 | -1.513 | 0.1339 |

table_model(AT_centered)
# | term | estimate | SE | t | p |
# |---|---|---|---|---|
# | (Intercept) | 78.6613 | 52.7234 | 1.492 | 0.1394 |
# | ORIG_PUBL_DATE | -0.0404 | 0.0267 | -1.513 | 0.1339 |
# Big words regressed on publication year: raw rows, yearly means, and
# centered yearly means.
BW_Raw <- lm(BigWords ~ ORIG_PUBL_DATE, data = df)
BW_Tidy <- lm(BigWords_mean ~ ORIG_PUBL_DATE, data = tidy_df)
BW_centered <- lm(BigWords_centered ~ ORIG_PUBL_DATE, data = tidy_df)

# The tables below are the output recorded in the notebook, kept as comments
# so that the script parses.
table_model(BW_Raw)
# | term | estimate | SE | t | p |
# |---|---|---|---|---|
# | (Intercept) | 34.9153 | 8.3032 | 4.205 | 0.0000 |
# | ORIG_PUBL_DATE | -0.0095 | 0.0042 | -2.270 | 0.0236 |

table_model(BW_Tidy)
# | term | estimate | SE | t | p |
# |---|---|---|---|---|
# | (Intercept) | 30.9375 | 8.9125 | 3.471 | 0.0008 |
# | ORIG_PUBL_DATE | -0.0075 | 0.0045 | -1.663 | 0.0999 |

table_model(BW_centered)
# | term | estimate | SE | t | p |
# |---|---|---|---|---|
# | (Intercept) | 14.8775 | 8.9125 | 1.669 | 0.0987 |
# | ORIG_PUBL_DATE | -0.0075 | 0.0045 | -1.663 | 0.0999 |
#Periods
# Period usage regressed on publication year: raw rows, yearly means, and
# centered yearly means.
Period_Raw <- lm(Period ~ ORIG_PUBL_DATE, data = df)
Period_Tidy <- lm(Period_mean ~ ORIG_PUBL_DATE, data = tidy_df)
# The model object shares its name with the tidy_df column it is fitted on;
# inside the formula the name resolves to the column because lm() looks in
# `data` first.
Period_centered <- lm(Period_centered ~ ORIG_PUBL_DATE, data = tidy_df)

# The tables below are the output recorded in the notebook, kept as comments
# so that the script parses. A p of 0 is the table's rounded display.
table_model(Period_Raw)
# | term | estimate | SE | t | p |
# |---|---|---|---|---|
# | (Intercept) | -25.4425 | 5.0328 | -5.055 | 0 |
# | ORIG_PUBL_DATE | 0.0167 | 0.0025 | 6.570 | 0 |

table_model(Period_Tidy)
# | term | estimate | SE | t | p |
# |---|---|---|---|---|
# | (Intercept) | -25.3173 | 4.4555 | -5.682 | 0 |
# | ORIG_PUBL_DATE | 0.0166 | 0.0023 | 7.377 | 0 |

table_model(Period_centered)
# | term | estimate | SE | t | p |
# |---|---|---|---|---|
# | (Intercept) | -33.4473 | 4.4555 | -7.507 | 0 |
# | ORIG_PUBL_DATE | 0.0166 | 0.0023 | 7.377 | 0 |
#WPS
# Words per sentence regressed on publication year: raw rows, yearly means,
# and centered yearly means.
WPS_Raw <- lm(WPS ~ ORIG_PUBL_DATE, data = df)
WPS_Tidy <- lm(WPS_mean ~ ORIG_PUBL_DATE, data = tidy_df)
# The model object shares its name with the tidy_df column it is fitted on;
# inside the formula the name resolves to the column because lm() looks in
# `data` first.
WPS_centered <- lm(WPS_centered ~ ORIG_PUBL_DATE, data = tidy_df)

# The tables below are the output recorded in the notebook, kept as comments
# so that the script parses. A p of 0 is the table's rounded display.
table_model(WPS_Raw)
# | term | estimate | SE | t | p |
# |---|---|---|---|---|
# | (Intercept) | 81.6724 | 9.6359 | 8.476 | 0 |
# | ORIG_PUBL_DATE | -0.0348 | 0.0049 | -7.159 | 0 |

table_model(WPS_Tidy)
# | term | estimate | SE | t | p |
# |---|---|---|---|---|
# | (Intercept) | 81.4208 | 8.7646 | 9.290 | 0 |
# | ORIG_PUBL_DATE | -0.0347 | 0.0044 | -7.823 | 0 |

table_model(WPS_centered)
# | term | estimate | SE | t | p |
# |---|---|---|---|---|
# | (Intercept) | 68.9008 | 8.7646 | 7.861 | 0 |
# | ORIG_PUBL_DATE | -0.0347 | 0.0044 | -7.823 | 0 |